
clear all 
set more off 
set maxvar 15000 
clear matrix

/*-----------------------------------------------------------------------------
	Purpose: Clean and code all cross sections of NLSY79 from 1979-2016.
	Creates (all saved under ./wrkdata/, relative to the directory set below):
	      nlsy79_4droptable.dta               long file, saved before sample restrictions
	      MD_benchmarkingexercise_nlsy79.dta  one record per respondent (benchmarking)
	      nlsy79_foranalysis_v2.dta           long file, after sample restrictions
	      nlsy79_newxsec.dta                  2002 cross section (final output)

	Requires: the global macro Mydirectory1 must be defined before this file runs.
	          renvars, unique and fre are called further down and are not official
	          Stata commands; they must already be installed.

	Note: In order to observe family income around 40 for most respondents,
	      the 2002 (cleaned) cross section is selected for main analysis. As 
	      the median age in 1979 cross section is 17, the median age in the 
	      2002 cross section will be around 40. 
----------------------------------------------------------------------------- */

* Every relative path used in this file resolves against this directory
cd "$Mydirectory1/1_DataSources/NLSY79/"

* Import supplementary dta files and save dta file
* NOTE(review): presumably each of these do-files reads one raw extract and saves the
* .dta of the same name in ./rawdata that is merged in further down; confirm.
	quietly run ./rawdata/employment_status_byintdate-value-labels.do
	quietly run ./rawdata/marstat_expanded-value-labels.do
	quietly run ./rawdata/parentsocc_Rage14-value-labels.do

* Import main data and run NLS-provided clean up
* (the data this do-file leaves in memory is the master for the merges that follow)
run "./rawdata/NLSY79-ISM-value-labels.do" 

*------------------------------------------------------------------------ 
*	Attach the supplementary extracts to the main NLSY79 file
*------------------------------------------------------------------------ 

	/* Each extract is keyed on CASEID_1979; merge 1:1 stops with an error if
	   that key is not unique on either side. The extracts are, in order:
	     employment_status_byintdate   employment status (available in every year)
	     marstat_expanded              marital status (expanded)
	     parentsocc_Rage14             parental occupations when R was 14 and
	                                   who R lived with at 14
	   The merge indicator is dropped each time so the next merge can recreate it. */
	foreach extract in employment_status_byintdate marstat_expanded parentsocc_Rage14 {
		sort CASEID_1979
		merge 1:1 CASEID_1979 using ./rawdata/`extract'.dta
		drop _merge
	}

*------------------------------------------------------------------------ 
*	Flag respondents who were interviewed in 2002
*------------------------------------------------------------------------ 
	/* Only the code -5 on AGEATINT_2002 is treated as "not interviewed"; any
	   other value, including the remaining negative codes, is flagged 1.
	   This has to run while AGEATINT_2002 still carries its raw name and codes. */
	gen interviewed2002 = (AGEATINT_2002 != -5)

*------------------------------------------------------------------------ 
*	RENAME AND CLEAN VARIABLES
*------------------------------------------------------------------------ 
*---------------------------
* Interview information 
*---------------------------
	/* The raw year-specific names end in the 4-digit survey year. Each renvars
	   call keeps that suffix and swaps the prefix:
	     CURDATE_M_yyyy     becomes doimyyyy      interview month (all years)
	     CURDATE_Y_yyyy     becomes a_doiyyyyy    interview year (1993-2016)
	     C_SAMPWEIGHT_yyyy  becomes xweightyyyy   cross-sectional sample weight
	   Interview year = survey year (by default). */
	renvars CURDATE_M_*, map("doim" + substr("@", -4,4))
	renvars CURDATE_Y_*, map("a_doiy" + substr("@", -4,4))
	renvars C_SAMPWEIGHT_*, map("xweight" + substr("@", -4,4))

	* If the interview month is missing (any negative code), assume it is March
	quietly recode doim???? (-5 -4 -3 -2 -1 = 3)

	* For interview year and weights the negative codes become missing
	quietly mvdecode a_doiy* xweight*, mv(-5 -4 -3 -2 -1)

*----------------
* ID variables
*----------------
	* Household id, respondent identification code, sample identification code
	rename (HHID_1979 CASEID_1979 SAMPLE_ID_1979) (hhid id samp_id)

*----------------
* Unions 
*----------------
	/* Note: the survey question is
	         "Is R in a union or employee association?"
	   It is asked per job; QES_88A_jj_yyyy becomes alljobs_unionjj_yyyy. */

	* Build the list of raw name patterns, one per survey year that is renamed
	local union_years 1979 1994 1996 1998 2000 2002 2004 2006 2008 2010 2012 2014 2016
	local union_raw
	foreach yr of local union_years {
		local union_raw `union_raw' QES_88A_*_`yr'
	}
	renvars `union_raw', map("alljobs_union" + substr("@", -7,.))
	quietly mvdecode alljobs_union*, mv(-5 -4 -3 -2 -1)
	summarize alljobs_union*

	* For every year that has a job-01 union variable, keep the first
	* non-missing union status across that year's jobs
	forvalues yr = 1979/2016 {
		capture confirm variable alljobs_union01_`yr'
		if (_rc) continue
		egen firstj_union`yr' = rowfirst(alljobs_union*_`yr')
	}

	* Nobody flagged as not interviewed in 2002 may carry a 2002 union status
	assert firstj_union2002==. if interviewed2002==0

/*-----------------------
* Sociodemographics
*------------------------*/

*Respondent
	* Sex (female is 1 when SAMPLE_SEX_1979 equals 2)
		clonevar sex = SAMPLE_SEX_1979
		gen female = (SAMPLE_SEX_1979 == 2)

	* Date of birth: the raw year holds only the last 2 digits, hence + 1900
		gen doby = 1900 + Q1_3_A_Y_1979
		gen dobm = Q1_3_A_M_1979

	* Race (several measures used)
		* Racial/ethnic cohort from 1978 screener (Hispanic, Black, non-Hispanic & non-Black)
			clonevar race = SAMPLE_RACE_78SCRN
			gen black = (race == 2)

		* Self-reported origin: the one R identifies with most closely (detailed),
		* and the 1st or only origin reported (detailed)
			clonevar sr_race = FAM_31_1979
			clonevar sr1_race = FAM_30_1_1979
			quietly mvdecode race sr_race sr1_race, mv(-5 -4 -3 -2 -1)

		* Combined self-report: closest origin, falling back on the 1st/only origin
			/*Note: See https://nlsinfo.org/content/cohorts/nlsy79/topical-guide/household/race-ethnicity-immigration-data
					for more info on race and ethnicity in NLSY79. */
			clonevar selfr_race = sr_race
			replace selfr_race = sr1_race if mi(selfr_race)

	* Age at interview, one variable per survey year
	    /*Note: No age restriction is applied at this point. The 30-50 restriction
	            is imposed later, in the sample selection section of this file. */
		renvars AGEATINT_*, map("age" + substr("@", -4,4))
		quietly mvdecode age*, mv(-5 -4 -3 -2 -1)

	* Marital status (collapsed)
		renvars MARSTAT_COL_*, map("marital_status" + substr("@", -4,4))
		quietly mvdecode marital_status*, mv(-5 -4 -3 -2 -1)

	* Foreign born
		/* Built from the two FAM_2A items. A "born in the US" answer (code 1) in
		   either year wins over a foreign-born answer in the other year, because
		   the 0 is assigned last.
		   NOTE(review): the foreign-born code is 2 in 1979 but 0 in 1983; confirm
		   against the codebook that the 1983 item really uses 0. */
		renvars FAM_2A_*, map("cntrybirth"+substr("@",-4,.))
		quietly mvdecode cntrybirth*, mv(-5 -4 -3 -2 -1)
		gen foreignborn = 1 if cntrybirth1979==2 | cntrybirth1983==0
		replace foreignborn = 0 if cntrybirth1979==1 | cntrybirth1983==1

	* R's region of residence (during first survey year)
		/* REGION_1979 still carries its raw negative codes here, so respondents
		   with no valid 1979 region are coded 0 rather than missing. */
		gen grewup_south = (REGION_1979 == 3)
		label variable grewup_south "REGION OF RES IN 79"
		label define grewup_south 0 "LIVES IN NON-SOUTH" 1 "LIVES IN SOUTH"
		label values grewup_south grewup_south

	* Region of residence in each survey year
		renvars REGION_*, map("region_residence"+substr("@",-4,.))
		quietly mvdecode region_residence*, mv(-5 -4 -3 -2 -1)

*Parents
	* Who R lived with around the age of 14
		rename FAM_7_1979 Rwho_livew_age14
		quietly mvdecode Rwho_livew_age14, mv(-5 -4 -3 -2 -1)

	* Foreign born
		/* Code 2 on the country-of-birth items means "other country".
		   Parents whose birthplace is missing are coded 0, not missing. */
		clonevar cntrybirth_mom1979 = FAM_15_1979
		clonevar cntrybirth_dad1979 = FAM_21_1979
		quietly mvdecode cntrybirth_*, mv(-5 -4 -3 -2 -1)
		gen fatherforeign = (cntrybirth_dad1979 == 2)
		gen motherforeign = (cntrybirth_mom1979 == 2)

	* Identifying siblings
		/* R is flagged 1 as soon as any of the five RELC items for 1979 holds one
		   of the listed relationship codes; everyone else is 0. */
		gen sib_interviewed_1979 = 0
		forvalues k = 1/5 {
			replace sib_interviewed_1979 = 1 if inlist(RELC`k'_1979,6,7,39,40,59,60,64,65)
		}
 
*-------------------------------
* Education
*------------------------------- 	

	* Respondents (available in all years from 1979-2016)
		/* Note: NLS documentation recommends that we clean the revised HGC vars
		   in each year. The code 95 (ungraded) is set to missing as well.
		   This decode must stay ahead of the parental renames below: once the
		   parental variables get lowercase hgc names the wildcard would also
		   match them, and 95 is not decoded for parents. */
		renvars HGCREV*, map("hgc"+ substr("@", -4,4))
		quietly mvdecode hgc*, mv(-5 -4 -3 -2 -1  95)

	* Parents (available in 1979)
		rename (HGC_MOTHER_1979 HGC_FATHER_1979) (hgc_mom1979 hgc_dad1979)
		quietly mvdecode hgc_mom1979 hgc_dad1979, mv(-5 -4 -3 -2 -1)

*-------------------------------
* HH size
*------------------------------- 	
	/* Note: 
		(1) The famsize variable was created by the nlsy79. 
	            They note that foster relationships, partners, boarders, 
	            guardians, and other individuals are not considered family 
	            members in the creation of this variable. Only relationships 
	            by blood, adoption, or marriage are included. 
	        (2) Based on NLS documentation, the respondent is always counted
	            when constructing famsize. 
	*/

	rename FAMSIZE_* R_hhsize_plusR*

	* Tabulate the raw values (negative codes included) for every survey year.
	* Two-digit years 79-98 belong to the 1900s, 0-16 to the 2000s.
	foreach num of numlist 79/93 94(2)98 0(2)16 {
		local yr = cond(`num' >= 79, 1900 + `num', 2000 + `num')
		tab R_hhsize_plusR`yr', m
	}

	mvdecode R_hhsize_plusR*, mv(-5 -4 -3 -2 -1)

	//TOPCODED VALUES
	/* Note: In the years below (1994 onward), household size should be topcoded
	         at 10, but for some reason there are a couple values >10. In other
	         surveys, any values exceeding the topcoded value have been recoded to
	         have the topcoded value. The same is done here. The tabulation is
	         shown before and after the recode. */
	foreach num of numlist 94(2)98 0(2)16 {
		local yr = cond(`num' >= 94, 1900 + `num', 2000 + `num')
		tab R_hhsize_plusR`yr', m
		replace R_hhsize_plusR`yr' = 10 if (R_hhsize_plusR`yr'>10 & R_hhsize_plusR`yr'<.)
		tab R_hhsize_plusR`yr', m
	}

*-------------------------------
* Employment 
*-------------------------------
* Respondents 

	* Class of worker per job (used later to flag the self-employed)
		renvars COWALL_EMP_*, map("worker_type_"+ substr("@",-7,.))
		quietly mvdecode worker_type_*, mv(-5 -4 -3 -2 -1)

* Father--not available

*-------------------------------
* Occupation 
*-------------------------------
/*Note: For info on how NLS occupation codes match 
        Census occupation codes, see 
	    //https://nlsinfo.org/content/cohorts/nlsy79/topical-guide/employment/occupations 
*/

*Parents
	/* For the mother/stepmother (items FAM_9, FAM_9A) and then the
	   father/stepfather (items FAM_11, FAM_11A):
	     work<parent>_Rage14   whether the parent worked when R was 14
	     occu_<parent>         the parent's occupation when R was 14
	   Only -1 (refusal) and -2 (don't know) are set to missing on the
	   occupation; the codes -3 and -4 are deliberately left in place. */
		foreach spec in "mom 9" "dad 11" {
			local who : word 1 of `spec'
			local item : word 2 of `spec'
			clonevar work`who'_Rage14 = FAM_`item'_1979
			clonevar occu_`who' = FAM_`item'A_1979
			quietly mvdecode occu_`who', mv(-2 -1)
		}

* Respondents (available 1996-2016)
	/* Occupation of every job, renamed for the even years 1996-2008:
	     1996-2000 carry 1970 Census occ classifications
	     2002-2008 carry 2000 Census occ classifications */
		local occ_raw
		foreach yr of numlist 1996(2)2008 {
			local occ_raw `occ_raw' OCCALL_EMP_*_`yr'
		}
		renvars `occ_raw', map("occuall_"+ substr("@",-7,.))
		quietly mvdecode occuall_*, mv(-5 -4 -3 -2 -1)

	//Identify first job: first non-missing occupation among jobs 01-05
		foreach yr of numlist 1996(2)2008 {
			local jobs
			forvalues j = 1/5 {
				local jobs `jobs' occuall_0`j'_`yr'
			}
			egen occufisrt_`yr' = rowfirst(`jobs')
		}


*----------------------------
*   Family Income
*------------------------------- 	

	*total net family income in past calendar year (before taxes), yrs1979-2016
	* TNFI_TRUNC_yyyy becomes famincyyyy; the negative codes -5 to -1 become missing
		renvars TNFI_TRUNC_*, map("faminc" + substr("@", -4,4)) 
		qui: mvdecode faminc* , mv(-5 -4 -3 -2 -1) 

*------------------------------------------------------------------------ 
*					RESHAPE TO LONG
*------------------------------------------------------------------------
* Topic-wise lists of the variables that survive the keep below;
* anything not named in one of these lists is dropped before the reshape.
local intvars="doim* a_doiy* xweight*"
local socvars="sex female doby dobm race region_residence* grewup_south foreignborn sr_race sr1_race selfr_race black age* marital_status*   "
local parentvars="workdad_Rage14 workmom_Rage14 Rwho_livew_age14 hgc_mom1979 hgc_dad1979 occu_mom occu_dad fatherforeign motherforeign"
local hhmemvars="R_hhsize_plusR* sib_interviewed_1979 RELC* IDC*"
local eduoccvars="hgc* employed_* worker_type_01_2002 occuall_* occufisrt_*"
local othvars="interviewed2002 firstj_union* faminc*"
keep hhid id samp_id `intvars' `socvars' `parentvars' `hhmemvars' `eduoccvars' `othvars'

* Go from one row per respondent to one row per respondent and survey year.
* The numeric suffix of each stub becomes doiy. Kept variables that are not
* listed as stubs (for example worker_type_01_2002 and the parental variables)
* are carried along unchanged on every yearly row.
/*qui:*/ reshape long doim a_doiy xweight age marital_status marital_status_expanded   ///
	hgc employed_ faminc region_residence  ///
	occuall_01_ occuall_02_ occuall_03_ occuall_04_ occuall_05_ occufisrt_ ///
		firstj_union R_hhsize_plusR  , i(samp_id hhid id) j(doiy)

* Tidy the stub name and expose the survey year under a second, labelled name
ren employed_ employed
gen year=doiy
label var year "Survey year" 

*------------------------------------------------------------------------ 
*						GENERATE NEW VARS
*------------------------------------------------------------------------
describe

*----------------------
* interview information	
*----------------------
	summarize doim doiy xweight

*-----------
* id vars 
*-----------
	* Number of distinct respondents, then of distinct respondent-years
	unique id
	unique id year
	summarize id

*----------------
* Unions 
*----------------
	* Union status on the first job with a non-missing answer in that year
	clonevar unionR = firstj_union
	summarize unionR
	label variable unionR "Wages set by union"
	label define yesno 1 "Yes" 0 "No"
	label values unionR yesno

*----------------
* Veterans 
*----------------
	* Placeholder: always missing in this dataset
	gen veteran = .

*------------------
* Demographics
*------------------
* Respondents
	*Gender
		fre sex
		label variable sex "R's sex"

	*Age
		summarize dobm doim

		/* Age in completed years implied by the interview and birth dates.
		   It replaces the reported age when that is missing or when it is more
		   than 2 years off from survey year minus birth year.
		   Note: There are some individuals with same age in two survey years. */
		gen ageA = floor((ym(doiy, doim) - ym(doby, dobm))/12)
		replace age = ageA if abs(doiy - age - doby) > 2 | mi(age)
		gen agesq = age * age

	* Birth cohorts
		clonevar dob = doby
		tab dob, m
		label variable dob "Year of birth"

	* Decade of birth, plus one dummy (decade_#) per decade
		gen decade = 10 * floor(dob/10)
		tab decade, m
		label variable decade "Decade of birth"
		tab decade, gen(decade_)

	* Race
		/* The screener-based measures are set aside under survey-specific names
		   and a harmonized two-category race is built in their place:
		     1 when the screener cohort is 1 or 3
		     2 when the screener cohort is 2 (Black)
		   As is the case in other surveys, Cuban, Chicano, Mexican,
		   Mexican-American, Puerto Rican, other Hispanic or Latino, and
		   European will be coded as white.
		   Respondents in category 1 whose self-reported origin is one of the
		   listed codes are then set to missing. */
		rename (race black) (race_nlsy79 black_nls79)

		gen race = 1 if inlist(race_nlsy79, 1, 3)
		replace race = 2 if race_nlsy79==2
		replace race = . if inlist(selfr_race,2,4,8,9,10,13,14,26) & race==1
		tab race, m

		gen black = (race==2) if race<.
		tab black, m

	* Married (from the collapsed marital status)
		gen married = (marital_status==2) if marital_status<.

	* Never married, divorced, widowed, separated (from the expanded status);
	* each dummy is missing when the expanded status is missing
		foreach spec in "never_married 0" "divorced 3" "widowed 6" "separated 2" {
			local dummy : word 1 of `spec'
			local code : word 2 of `spec'
			gen `dummy' = (marital_status_expanded==`code') if marital_status_expanded<.
			tab `dummy', m
		}
	
*------------------
* Foreign Born 
*------------------

*respondents
		fre foreignborn
		label variable foreignborn "R was born outside the US"

*------------------
* Region R currently resides in
*------------------

*respondents
		/*Note: See //https://www.nlsinfo.org/content/cohorts/nlsy79/other-documentation/codebook-supplement/nlsy79-attachment-100-geographic
		        for more info */
		* The next line relies on variable abbreviation: region stands for region_residence
		fre region
		clonevar region4 = region_residence
		label variable region4 "Region of Residence"
		label define region_l 1 "Northeast" 2 "Midwest" 3 "South" 4 "West"
		label values region4 region_l

*-------------------------------
* HH size
*------------------------------- 

	//Give the topcode value 10 a label of its own in the existing value label
	label define vlT5770700 10 "10", modify
	tab R_hhsize_plusR, m

	//Household size without the respondent
	gen R_hhsize_minusR = R_hhsize_plusR - 1
	tab R_hhsize_minusR, m

	label variable R_hhsize_plusR "total # of persons in R's hh (including R)"
	label variable R_hhsize_minusR "total # of persons in R's hh (NOT including R)"

*---------------------------
* Employment 
*--------------------------

*Respondent
	* Self-employed: class of worker 4 or 5 on job 01 in 2002.
	* The 2002 value is repeated on every yearly row of a respondent.
	gen selfemployed = inlist(worker_type_01_2002, 4, 5) if worker_type_01_2002 != .

*---------------------------
* Education 
*--------------------------
*Respondent
	* Categorical edu var, from highest grade/yr completed & got credit
		fre hgc
		gen eduR = .
		replace eduR = 0 if hgc==0              /* none */
		replace eduR = 1 if hgc>=1 & hgc<8      /* some grade school */
		replace eduR = 2 if hgc==8              /* completed 8th grade */
		replace eduR = 3 if hgc>8 & hgc<12      /* some HS */
		replace eduR = 4 if hgc==12             /* 4 years of HS */
		replace eduR = 5 if hgc>12 & hgc<16     /* 1-3 years of college */
		replace eduR = 6 if hgc>15 & hgc<.      /* 4 or more years of college (like BA) */
		tab eduR, m

	* Years of school
		clonevar yrsschool = hgc
		label variable yrsschool "Years of school"
		tab yrsschool, m nol

	* Years of school (binned): each range is replaced by one representative value.
	* Values above 20 are left missing.
		gen yrsschool_bin = 0 if yrsschool==0
		replace yrsschool_bin = 6  if yrsschool>0 & yrsschool<8
		replace yrsschool_bin = 8  if yrsschool==8
		replace yrsschool_bin = 10 if yrsschool>8 & yrsschool<12
		replace yrsschool_bin = 12 if yrsschool==12
		replace yrsschool_bin = 14 if yrsschool>12 & yrsschool<16
		replace yrsschool_bin = 16 if yrsschool>=16 & yrsschool<=20
		tab yrsschool_bin, m
		label variable yrsschool_bin "Years of school, binned"

	* High school degree: category 4 (4 years of HS) or above
		gen hs_ed = (eduR>=4) if !mi(eduR)
		tab hs_ed, m
		label variable hs_ed "HS educated"

	* College education degree: category 6 (4 or more years of college)
		gen coll_ed = (eduR>=6) if !mi(eduR)
		tab coll_ed, m
		label variable coll_ed "Coll educated"

*-------------------*

*Parents 

	* Raw years of schooling reported for each parent in 1979
	clonevar dad_ed_raw = hgc_dad1979
	clonevar mom_ed_raw = hgc_mom1979

* Educational categories for parents 
	/* Same 0-6 scheme as the respondent's eduR, except for the top category.
	   NOTE(review): only raw values 16 and 17 map to category 6. A raw value
	   of 18 or more (other than 25) gets no category and stays missing, whereas
	   the respondent coding sends every value above 15 to category 6. Confirm
	   against the codebook whether parents can report more than 17 years.
	   NOTE(review): raw value 25 is mapped to category 4; its meaning is not
	   documented in this file. TODO confirm. */
		foreach parent in mom dad {
			local raw `parent'_ed_raw
			gen edu_`parent' = 0 if `raw'==0                    //none
			replace edu_`parent' = 1 if `raw'>0 & `raw'<8       //some grade school
			replace edu_`parent' = 2 if `raw'==8                //completed 8th grade
			replace edu_`parent' = 3 if `raw'>8 & `raw'<12      //some HS
			replace edu_`parent' = 4 if inlist(`raw', 12, 25)   //4 years of HS (and code 25)
			replace edu_`parent' = 5 if `raw'>12 & `raw'<16     //some college
			replace edu_`parent' = 6 if inlist(`raw', 16, 17)   //college
			label variable edu_`parent' "Educational categories for `parent'"
		}

	* Binned
		* Representative years of school for categories 0 to 6, in that order
		local bin_years 0 6 8 10 12 14 16
		foreach parent in mom dad {
			gen edu_`parent'_bin = .
			forvalues cat = 0/6 {
				local yrs : word `=`cat'+1' of `bin_years'
				replace edu_`parent'_bin = `yrs' if edu_`parent'==`cat'
			}
			tab edu_`parent'_bin, m
			label variable edu_`parent'_bin "`parent' Years of school from bins"

		* High school degree: category 4 or above
			gen `parent'_hs_ed = (edu_`parent'>=4) if edu_`parent'<.
			tab `parent'_hs_ed, m

		* College education degree: category 6
			gen `parent'_coll_ed = (edu_`parent'>=6) if edu_`parent'<.
			tab `parent'_coll_ed, m

			label variable `parent'_hs_ed "`parent' HS educated"
			label variable `parent'_coll_ed "`parent' College educated"
		}

	* Years of school: the raw copies get their final names
		foreach parent in dad mom {
			rename `parent'_ed_raw yrsschool_`parent'
			tab yrsschool_`parent', m
		}

*-----------------------------------
* Crosswalk occupations (parents) 
*-----------------------------------
	
	// Both parental occs match 3-digit 1970 Census codes
	tab occu_dad, m
	tab occu_mom, m

	// For each parent in turn (dad first): copy the occupation into the
	// crosswalk key census1970, merge the crosswalk, and store the code it
	// brings in (fatheroccej) under a parent-specific name.
	foreach name in dad mom {
	
		clonevar census1970=occu_`name'
		merge m:1 census1970 using "../Crosswalks/Crosswalk_1970Census_toANES" 
		// Rows found only in the data must be missing or carry -4 or -3, the
		// negative codes left undecoded on the parental occupations. Any other
		// occupation code that the crosswalk lacks stops the run here.
		assert (census1970==. | census1970==-4 | census1970==-3) if _merge==1
		// Discard crosswalk rows that match no observation
		drop if _merge==2
		drop _merge

		//Rename fatheroccej so that crosswalk can be brought in again
		// (the father's code is parked as father_occ1950ej; it gets the name
		//  fatheroccej back after the respondent crosswalk further down)
		if "`name'" =="dad" rename fatheroccej father_occ1950ej 
		if "`name'" =="mom" rename fatheroccej motheroccej

		//Get rid of old census1970 var so that next person's occ can be crosswalked
		drop census1970 
	}

********************************************************************************************************
 * For fathers/mothers: Fix occupations for self-employed businessmen, managers, or officials before merging income scores * 
*******************************************************************************************************
/* Note: Not able to find class of worker for R's mother or father
         when R was growing up. So no such fix is applied to the parents. */	

*----------------------------------------
* Crosswalk occupations (respondents) 
*----------------------------------------
/* Note: (1) See https://www.census.gov/content/dam/Census/library/publications/2020/demo/acs-tp78.pdf
         for more info on how Census occ codes change over time.
         (2) 1996-2000: will use the 1970 occ census codes
         (3) 2002-2008: will use the 2010 occ census codes
*/

	/* Note: Per Census documentation above, it's necessary to add a 0 
	         to the end of occ codes in 2002 */
	// NOTE(review): only year 2002 is multiplied by 10, while 2004-2008 enter the
	// crosswalk below unchanged; presumably those years are already 4-digit. Confirm.
	replace occufisrt_=occufisrt_*10  if year==2002 // now 4 digits
	sum occufisrt_  if inrange(year, 2002,2008)

	//Make sure that respondents in year 2002-2008 have 2010 Census occ codes
	// Done on a temporary copy: keep the 2002-2008 rows, look up occ2010 in the
	// crosswalk (matched rows only), and merge the result back on hhid id year.
	// Respondent-years whose code has no match end up with occ2010 missing.
		preserve
		keep  if inrange(year, 2002,2008)
		keep hhid id year occufisrt_ 
		clonevar occ2002=occufisrt_ 
		merge m:1 occ2002 using ../Crosswalks/Occ_Census2002_to_Census_2010.dta , keep(match) nogen
		keep hhid id  year occ2010
		tempfile occ2010
		save `occ2010'
		restore
		merge 1:1 hhid id  year using  `occ2010', nogen  
	
	//Assign 1970 Census code to respondents in years 1996-2000 
	clonevar occ1970=occufisrt_  if inrange(year, 1996,2000)
	
	// Create 1 (uncoarsened) occ variable for respondents
	// (1970 codes in 1996-2000, 2010 codes in 2002-2008, missing in all other years)
	gen occufirst=.
	replace occufirst=occ1970  if inrange(year, 1996,2000)
	replace occufirst=occ2010  if inrange(year, 2002,2008)
	
	// Plain copy of occufirst; it is the source for both crosswalks below
	gen occu_youth_atlst1=occufirst


	// Crosswalk respondent occs based on 1970 Census to 1950 ANES occs
	// Every non-missing 1970 code must be found in the crosswalk (assert below)
		clonevar census1970=occu_youth_atlst1 if inrange(year,1996,2000)
		merge m:1 census1970 using "../Crosswalks/Crosswalk_1970Census_toANES"
			assert census1970==. if _merge==1
			drop if _merge==2
			drop _merge			
			rename fatheroccej occej_from1970 	

	// Crosswalk respondent occs based on 2010 Census to 1950 ANES occs 
	// Same pattern; this crosswalk delivers its code as fatheroccej_2010
		clonevar census2010=occu_youth_atlst1 if inrange(year,2002,2008)
		merge m:1 census2010 using "../Crosswalks/Crosswalk_2010Census_toANES" 
			assert census2010==. if _merge==1
			drop if _merge==2
			drop _merge		
			rename fatheroccej_2010 occej_from2010 		

	// Create 1 coarsened occ variable for respondents
	// (capture: the drop is a no-op when no variable named fatheroccej is left)
		capture drop  fatheroccej  
		gen fatheroccej=.
		replace fatheroccej=occej_from1970 if inrange(year,1996,2000)
		replace fatheroccej=occej_from2010 if inrange(year,2002,2008)

*********************************************************************************************
* For respondents: Fix occupations for self-employed businessmen, managers, or officials  * 
*********************************************************************************************
	// Self-employed respondents coded 28 are moved to code 21.
	// NOTE(review): the meaning of codes 21 and 28 is not documented in this file.
	// selfemployed is built from the 2002 job-01 class of worker only.
	replace fatheroccej=21 if fatheroccej==28 & selfemployed==1 

* Rename respondent and father occupation
* (from here on occRej is the respondent's code and fatheroccej the father's)
	rename fatheroccej occRej 
	rename father_occ1950ej fatheroccej 
	
*------------------------------------------
* Head of hh dummies + father_imputed dummy	
*------------------------------------------
	/* All four dummies are built from Rwho_livew_age14 (who R lived with around
	   the age of 14) and are missing when that variable is missing:
	     headofhh_father       codes 11-25
	     headofhh_mother       codes 51, 52, 91
	     headofhh_othermale    codes 31-45
	     headofhh_otherfemale  codes 53, 54, 93 */
	
	gen headofhh_father = (inrange(Rwho_livew_age14,11,25)) if Rwho_livew_age14<.
	tab headofhh_father, m
	
	gen headofhh_mother = (inlist(Rwho_livew_age14,51,52,91)) if Rwho_livew_age14<.
	tab headofhh_mother, m
	
	gen headofhh_othermale = (inrange(Rwho_livew_age14,31,45)) if Rwho_livew_age14<.
	tab headofhh_othermale,m 
	
	gen headofhh_otherfemale = (inlist(Rwho_livew_age14,53,54,93)) if Rwho_livew_age14<.
	tab headofhh_otherfemale,m 
	
	label var headofhh_father "Head of hh when R was growing up was R's father"
	label var headofhh_othermale "Head of hh when R was growing up was some other male (not R's father)"
	label var headofhh_otherfemale "Head of hh when R was growing up was some other female (not R's mother)"
	label var headofhh_mother "Head of hh when R was growing up was R's mother"

	//Create alternate dummy for hh head being R's father. 
	     /* Note: When R reports occupation of their father but 
	              does not specify who they lived with when
	              growing up, will assume that R lived with father.
	     */	
	gen headofhh_father_imputed = headofhh_father
	replace headofhh_father_imputed =1 if fatheroccej!=. & Rwho_livew_age14==.
	tab headofhh_father_imputed,m 
	tab headofhh_father, m
	// The living-arrangement question refers to age 14 (FAM_7_1979), so the
	// label says 14; it used to say 16.
	label var headofhh_father_imputed "Impute dad when parent occ != missing & no info about hh head at age 14"	
	
*-----------------------------------------------------------------------------------------------------
*  DUMMIES FOR WHEN WE KNOW WHY DAD OR MOM DIDN'T WORK (I.E. WHY FATHEROCCEJ/MOTHEROCCEJ IS MISSING)
*-----------------------------------------------------------------------------------------------------
	/*Note: There is a skip-logic to the parental occupation 
	        questions. Anyone who answered "no" to the previous 
	        question---"did your mom/mother figure or father/father 
	        figure work for pay when you were 14 years old?"----
	        will have a missing value for occupation.

	  Coding of both dummies:
	     0  the parent's occupation code is present
	     1  the occupation is missing, R said the parent did not work when R was 14,
	        and R reports living with that parent or a parent figure of that sex
	     .  otherwise */

	* Father: living with father or another male figure means codes 11-45
	gen father_notworking = 0 if fatheroccej!=.
	replace father_notworking = 1 if fatheroccej==. & workdad_Rage14==0 & inrange(Rwho_livew_age14,11,45)
	tab father_notworking, m

	* Mother: living with mother or another female figure, listed code by code
	gen mother_notworking = 0 if motheroccej!=.
	replace mother_notworking = 1 if motheroccej==. & workmom_Rage14==0 & inlist(Rwho_livew_age14,11,12,13,14,21,22,23,24,31,32,33,34,41,42,43,44,51,52,53,54,91,93)
	tab mother_notworking, m

* Variable for father being either farm laborer or operator
	* 1 for occupation codes 71 and 81, 0 for any other code, missing without a code
	gen fatherfarm = inlist(fatheroccej, 71, 81) if fatheroccej!=.
	
*---------------------------
* Family  Income
*----------------------------
* Respondents
 	sum faminc, d
 	lab var faminc "Family Income, no-binned"

*---------------------------
* Family  Income - Binned
*----------------------------

* Respondents
	/* The 2006 GSS wave is first used as a reference for the bins.
	   Some changes are then made to the bin structure in order
	   to end up with 10-12 roughly evenly sized bins.

	   Every bin is replaced by its midpoint, except the two open-ended ones:
	   the bottom bin gets 0.75 x its upper bound and the top bin gets
	   1.25 x its lower bound. */
	gen fam_inc=.
		replace fam_inc=6000*0.75 if faminc>=0 & faminc<6000 
		replace fam_inc=10500 if faminc>= 6000 & faminc < 15000 
		replace fam_inc=20000 if faminc>= 15000 & faminc < 25000
		replace fam_inc=30000 if faminc>= 25000 & faminc < 35000
		replace fam_inc=40000 if faminc>= 35000 & faminc < 45000
		replace fam_inc=50000 if faminc>= 45000 & faminc < 55000
		replace fam_inc=60000 if faminc>= 55000 & faminc < 65000
		replace fam_inc=70000 if faminc>= 65000 & faminc < 75000
		replace fam_inc=80000 if faminc>= 75000 & faminc < 85000
		replace fam_inc=95000 if faminc>= 85000 & faminc < 105000 
		// Midpoint of 105,000-135,000 is 120,000. This used to be 12000, which
		// placed these high-income families below the 15,000-25,000 bin.
		replace fam_inc=120000 if faminc>= 105000 & faminc < 135000 
		replace fam_inc=135000*1.25 if faminc>= 135000 & !mi(faminc)
		// No binned income in 2002 for respondents not interviewed that year
		replace fam_inc=. if year==2002 & interviewed2002!=1 

	tab fam_inc if year==2002 & interviewed2002==1 
	label var fam_inc "R's Family Income, binned (based on midpoints of each bin)" 

/*Note: The suffix "_son" is used to match the variable names in other datasets. 
        All respondents (i.e., male and female) are given a value for these variables.*/	
        // Flags for the two open-ended bins; missing when fam_inc is missing
        gen bottomcoded_son = fam_inc==6000*0.75 if fam_inc<. 
		tab bottomcoded_son, m 
		label var bottomcoded_son "Respondent family income, bottom coded" 

		gen topcoded_son = fam_inc==135000*1.25 if fam_inc<. 
			tab topcoded_son, m 
		label var topcoded_son "Respondent family income, top coded"

capture drop __*

*------------------------------------------------------------------------ 
* Save a dataset here for Appendix E exercise (drop table)
*------------------------------------------------------------------------ 
	* Long file with every derived variable, before any sample restriction
	save "./wrkdata/nlsy79_4droptable.dta", replace

*------------------------------------------------------------------------ 
* Jácome et al. vs Mazumder et al benchmarking exercise
*------------------------------------------------------------------------ 
	* Runs on a temporary copy: the data in memory is restored afterwards
	preserve
		rename id CASEID
		local benchvars fatheroccej motheroccej father_notworking grewup_south selfr_race dob
		keep CASEID `benchvars'
		sort CASEID

		/* Preliminary step: check that every benchmarking variable is constant
		   within respondent. For each variable, tag the first row of each distinct
		   (CASEID, value) pair, missing values included, and count the tags per
		   CASEID; a count of 1 everywhere means a single value per respondent.
		   The count is tabulated for inspection only, nothing is asserted.
		   The helper variables are numbered in the order given by tagnums, which
		   lines up word by word with benchvars. */
		local tagnums 1 2 6 3 4 5
		local pos = 0
		foreach var of varlist `benchvars' {
			local ++pos
			local num : word `pos' of `tagnums'

			egen tag`num' = tag(CASEID `var'), missing
			by CASEID: egen tag`num'_total = total(tag`num'==1)
			tab tag`num'_total, m

			* Mark the variable as coming from NLSY79
			rename `var' `var'_nlsy79
		}
		drop tag*

		//Keep one record for each unique id
		bysort CASEID: keep if _n==1
		count
		save "./wrkdata/MD_benchmarkingexercise_nlsy79.dta", replace
	restore

capture drop __*

*------------------------------------------------------------------------ 
*    OUR SAMPLE SELECTION (to harmonize across datasets)
*------------------------------------------------------------------------ 

*Restrictions
	* Drop the foreign born; respondents with unknown birthplace are kept
	keep if inlist(foreignborn, 0, .)
	* Keep respondent-years in which R is aged 30 to 50
	keep if inrange(age,30,50)

* Harmonized names and a running respondent id
rename xweight xweight_nlsy79
clonevar famid = hhid
sort hhid id year
egen id_nlsy79 = group(hhid id)
unique id_nlsy79

save "./wrkdata/nlsy79_foranalysis_v2.dta", replace

*------------------------------------------------------------------------ 
*		Create a cross section
*------------------------------------------------------------------------ 

use "./wrkdata/nlsy79_foranalysis_v2.dta", clear
capture drop __*

* Keep only cross-section respondents (sample ids 1 through 8)
	keep if inrange(samp_id,1,8)

* Keep respondents aged 30-50 and interviewed in 2002
	sort id year
	keep if year==2002 & interviewed2002==1 & inrange(age,30,50)
	unique id

	capture drop __*
	* One row per respondent is expected from here on
	unique id
	duplicates list id  //no duplicates

* Convert family income variable to 1950 dollars
	/* Income refers to the past calendar year, so the deflator of the year
	   before the survey year is used. Rows that exist only in the CPI file are
	   not brought in. */
	gen year_CPI = year - 1
	merge m:1 year_CPI using "../CPI/CPI_deflator.dta", keep(master match) nogenerate

	gen fam_inc_real = fam_inc * deflator
	label variable fam_inc_real "Family income (bins), in 1950 dollars"

	gen lnfaminc = ln(fam_inc_real)
	label variable lnfaminc "Logged family income (bins)"

	drop CPI year_CPI deflator

	rename id origid_nlsy
	label variable origid_nlsy "survey-given id var (nlsy79)"

capture drop __*
save "./wrkdata/nlsy79_newxsec.dta", replace



